library(data.table)
library(ggplot2)
library(dplyr)
library(ggrepel)

Introduction

This R Markdown creates a Manhattan plot where each gene is represented by its lead cis-eQTL (minimum normial p values). It also plots a TSS distance histogram.

Load data

ciseQTLs = fread(params$ciseQTLs_path)
location = fread(params$snpsloc_path)

# merge
data = merge(location, ciseQTLs, by = "snp", all = FALSE)

# Standardise columns
if (!("chr" %in% names(data))) {
  # try alternatives
  if ("CHR" %in% names(data)) data$chr <- data$CHR
}
data$name = paste0(data$snp, "_", data$gene_name)
keep_cols <- intersect(c("chr","pos","snp","gene_name","name","p"), names(data))
data = data[, ..keep_cols]
setnames(data, names(data), tolower(names(data)))
setnames(data, c("chr","pos","snp","gene_name","name","p"), c("chr","bp","risd","gene_name","snp_id","p"), skip_absent=TRUE)

# coerce types
data$chr <- as.numeric(gsub("chr", "", as.character(data$chr)))
data$bp <- as.numeric(as.character(data$bp))
data <- data[!is.na(data$chr) & !is.na(data$bp)]

Add gene locations and compute distance to TSS

if (!file.exists(params$geneloc_path)) stop("geneloc file not found: ", params$geneloc_path)
geneloc <- fread(params$geneloc_path)

# pick gene_name column robustly
if ("V7" %in% names(geneloc) & "V8" %in% names(geneloc)) {
  geneloc$gene_name = ifelse(geneloc$V7 %in% c("KNOWN","NOVEL"), geneloc$V8, geneloc$V7)
} else if ("gene_name" %in% names(geneloc)) {
  geneloc$gene_name = geneloc$gene_name
} else {
  stop("Unexpected geneloc format — cannot determine gene name column")
}

geneloc = geneloc[!duplicated(geneloc$gene_name),]

yy = merge(data, geneloc, by = "gene_name", all.x = TRUE)
if (nrow(yy) == 0) stop("Merge with gene locations produced 0 rows — check gene names")

# compute TSS depending on strand column V5 or strand if present
strand_col <- if ("V5" %in% names(yy)) "V5" else if ("strand" %in% names(yy)) "strand" else NA
start_col <- if ("V2" %in% names(yy)) "V2" else if ("start" %in% names(yy)) "start" else NA
end_col <- if ("V3" %in% names(yy)) "V3" else if ("end" %in% names(yy)) "end" else NA

if (is.na(strand_col) | is.na(start_col) | is.na(end_col)) {
  warning("Gene location file missing expected columns for TSS calculation. TSS will be NA where not computable.")
  yy$gene_tss <- NA
} else {
  yy$gene_tss = ifelse(yy[[strand_col]] == "+", yy[[start_col]], yy[[end_col]])
}

yy$distance_kb = (yy$gene_tss - yy$bp) / 1000
yy$distance_bp = yy$gene_tss - yy$bp

save_table = yy[, .(gene_name, chr, bp, risd, snp_id, gene_tss, distance_bp)]
setnames(save_table, c("gene_name","chr","bp","risd","snp_id","gene_tss","distance_bp"),
         c("gene_name","CHR","BP(hg19)","SNP","Pair","gene_tss","distance(bp)"))
#write.table(save_table, "PC47_LMM.resul_TSS.distance.txt", row.names = FALSE, quote = FALSE, sep = "\t")
cat("Wrote PC47_LMM.resul_TSS.distance.txt with", nrow(save_table), "rows\n")
## Wrote PC47_LMM.resul_TSS.distance.txt with 1160107 rows

Distance distribution plot

dtss = yy$distance_kb
dtss = dtss[!is.na(dtss)]
if (length(dtss) == 0) {
  plot.new(); text(0.5,0.5,"No TSS distances available")
} else {
  h <- hist(dtss, breaks=100, col="#c77cff",
            main="Distance distribution of cis-eQTLs",
            xlab="Distance between eQTL and TSS (KB)")
  xfit <- seq(min(dtss), max(dtss), length=40)
  yfit <- dnorm(xfit, mean=mean(dtss), sd=sd(dtss))
  yfit <- yfit * diff(h$mids[1:2]) * length(dtss)
  lines(xfit, yfit, col="grey", lwd=2)
}

subsample non-significant points (speeds up plotting)

sig_cut <- 0.001
sig.dat <- data %>% filter(p < sig_cut)
notsig.dat <- data %>% filter(p >= sig_cut)

prop <- params$subsample_non_sig_prop
if (nrow(notsig.dat) > 0) {
  notsig.sample <- notsig.dat %>% slice_sample(prop = prop)
  data2 <- bind_rows(sig.dat, notsig.sample)
} else {
  data2 <- data
}
data2$BP <- as.numeric(data2$bp)
data2$CHR <- as.integer(data2$chr)

Compute cumulative positions for Manhattan plot

don <- data2 %>%
  group_by(CHR) %>%
  summarise(chr_len = max(BP, na.rm = TRUE)) %>%
  mutate(tot = cumsum(chr_len) - chr_len) %>%
  select(-chr_len) %>%
  left_join(data2, ., by = c("CHR" = "CHR")) %>%
  arrange(CHR, BP) %>%
  mutate(BPcum = BP + tot)

axisdf = don %>% group_by(CHR) %>% summarize(center = (max(BPcum, na.rm=TRUE) + min(BPcum, na.rm=TRUE)) / 2)

Highlight TST genes and plot

if (file.exists(params$TST_genes_path)) {
  TST <- fread(params$TST_genes_path)
  if ("MaxFC" %in% names(TST)) TST <- TST[TST$MaxFC >= 0.58, ]
  if ("HGNC" %in% names(TST)) TST$gene_name = TST$HGNC
} else {
  TST <- data.table(gene_name = character(0))
}

# pick lead SNP per gene (minimum p-value)
don$p <- as.numeric(as.character(don$p))
cat(print(min(don$p, na.rm=TRUE)))
## [1] 4.265115e-153
## 4.265115e-153
don2 <- setDT(don)[, .SD[which.min(p)], by = gene_name]

cat("Lead SNPs selected:", nrow(don2), "unique genes\n")
## Lead SNPs selected: 17033 unique genes
cat("Minimum p-value in plot data:", min(don2$p, na.rm = TRUE), "\n")
## Minimum p-value in plot data: 4.265115e-153
don2$p <- as.numeric(as.character(don2$p))
don2$log10p <- -log10(don2$p)

g <- ggplot(don2, aes(x = BPcum, y = log10p)) +
  geom_point(aes(color = as.factor(CHR)), alpha = 0.6, size = 0.6) +
  scale_color_manual(values = rep(c("grey", "black"), 22)) +
  scale_x_continuous(label = axisdf$CHR, breaks = axisdf$center) +
  ylab(expression("-Log"["10"]~italic("P"))) + xlab("Genomic position (Chr)") +
  theme_bw() +
  theme(legend.position = "none",
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.text.x = element_text(angle = 30, hjust = 1),
        plot.title = element_text(face = "bold")) +
  ggtitle("HIRV_LMM lead cis-eQTLs (lead SNP per gene)")+
  geom_point(data = don2[don2$gene_name %in% TST$gene_name,], col = "red", alpha = 1, size = 0.6) +
  geom_text_repel(data = don2[don2$gene_name %in% TST$gene_name & don2$p < 1e-50,],
                  aes(label = gene_name), size = 3.5, min.segment.length = 0, max.overlaps = Inf)

print(g)

sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
## 
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/London
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggrepel_0.9.6     dplyr_1.1.4       ggplot2_3.5.1     data.table_1.16.2
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6      jsonlite_1.8.7    compiler_4.3.2    highr_0.10       
##  [5] tidyselect_1.2.1  Rcpp_1.0.13-1     jquerylib_0.1.4   scales_1.3.0     
##  [9] yaml_2.3.7        fastmap_1.1.1     R6_2.5.1          labeling_0.4.3   
## [13] generics_0.1.3    knitr_1.45        tibble_3.2.1      munsell_0.5.1    
## [17] bslib_0.5.1       pillar_1.9.0      R.utils_2.12.3    rlang_1.1.4      
## [21] utf8_1.2.4        cachem_1.0.8      xfun_0.41         sass_0.4.7       
## [25] cli_3.6.3         withr_3.0.2       magrittr_2.0.3    digest_0.6.33    
## [29] grid_4.3.2        lifecycle_1.0.4   R.methodsS3_1.8.2 R.oo_1.27.0      
## [33] vctrs_0.6.5       evaluate_0.23     glue_1.8.0        farver_2.1.2     
## [37] fansi_1.0.6       colorspace_2.1-1  rmarkdown_2.25    tools_4.3.2      
## [41] pkgconfig_2.0.3   htmltools_0.5.7